########################################################################
#   Michael Chaitkin, Justin Healy, and Shannon Milroy                 #
#   Code for Gov 2001                                                  #
#   What’s God Got To Do With It? Estimating How Religious             #
#   Affiliation Affects Individuals' Response to Extrinsic Rewards     #
########################################################################

########################################################################
#   Note about data: this replication requires one .dta files,         #
#   "XSect.dta." This was furnished directly by the authors of         #
#   the original paper. It should both be saved in the working         #
#   directory                                                          #
########################################################################
rm(list=ls())
#setwd()

library(car)
library(doBy)
library(estout)
library(foreign)
library(ggplot2)
library(gmodels)
library(Hmisc)
library(lmtest)
library(multcomp)
library(multiwayvcov)
library(plm)
library(stargazer)
library(survey)
library(cem)
library(MASS)
library(MatchIt)

########################################################################
#                                                                      #
#             Run analysis with packsSold as outcome                   #
#                                                                      #
########################################################################

########################################################################
#                     Load data                                        #  
########################################################################

## VARIABLE DEFINITIONS
# treat = which treatment arm
# packs = restocked number of packs
# packsSold = calculated (includes endownment of 12)
# barb = salon is a barbershop (binary)
# bar = salon is near a bar (binary)
# numEmpl = salon size (number of employees) (count)
# LnumEmpl = salon size (log number of employees) 
# cellpop = number of trained salons in the same area (count)
# otherProd = stylist sells other products in salon (binary)
# assetsLow = stylist is in bottom quartile of asset distribution (binary)
# lowsocio = stylist's socioeconomic status is low (binary) (proxied by either not speaking
# English or not having completed primary education)
# donation = stylist's dictator game donation (in Kwacha)
# donationHigh = stylist's dictator game donation above the median (binary)
# social = stylist's reported work motivation is intrinsic
# wage = monthly income of the salon (in Kwacha)
# litone = stylist can read and write in at least one language (binary)
# english = stylist can read and write in english
# numProducts = total number of products sold

##Read in data from Stata file
xsect.full <- read.dta("XSect.dta")

##Subset to relevant variables and recode
xsect.full$bar <- as.numeric(xsect.full$bar)-1
xsect.full$english <- as.numeric(xsect.full$english)-1
xsect.full$otherProd <- as.numeric(xsect.full$otherProd)-1
xsect <- subset(xsect.full, select = c("packsSold","treat","barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman", "cellNo"))

##Define median person
barb.med <- median(na.omit(xsect.full$barb))
bar.med <- median(na.omit(xsect.full$bar))
LnumEmpl.med <- median(na.omit(xsect.full$LnumEmpl))
cellPop.med <- median(na.omit(xsect.full$cellPop))
otherProd.med <- median(na.omit(xsect.full$otherProd))
assetsLow.med <- median(na.omit(xsect.full$assetsLow))
lowsocio.med <- median(na.omit(xsect.full$lowsocio))
donationHigh.med <- median(na.omit(xsect.full$donationHigh))
social.med <- median(na.omit(xsect.full$social))
roman.med <- median(na.omit(xsect.full$roman))

########################################################################
#                   Log likelihood function                            #  
########################################################################

##Negative binomial function
negbin.likelihood <- function(param, x, y){
  betas <- param[1:ncol(x)]
  gamgam <- param[ncol(x)+1]
  xb <- x%*%betas
  sum(lgamma(exp(gamgam)+y) - lgamma(exp(gamgam)) + y*xb + exp(gamgam)*gamgam 
      - (exp(gamgam)+y)*log(exp(xb)+exp(gamgam)))
}

########################################################################
#                           Begin analysis                            #  
########################################################################

########################################################################
#                         Linear model                                 #
########################################################################

##Create data set for linear model
xsect.linear <- xsect
xsect.linear <- na.omit(xsect.linear)
xsect.linear$treat <- as.factor(xsect.linear$treat)
xsect.linear$treat <- relevel(xsect.linear$treat, "PVT")

##Run linear model
linear.model <- lm(packsSold ~ treat + barb + bar + LnumEmpl + cellPop + otherProd +
                     assetsLow + lowsocio + donationHigh + social + roman, data = xsect.linear)

##Calculate clustered variance-covariance matrix
cluster.v <- cluster.vcov(linear.model,xsect.linear$cellNo)
se.cluster = sqrt(diag(cluster.v))
sigma2 <- var(linear.model$model$packsSold)

########################################################################
#                     Simulate linear model                            #  
########################################################################

##Set seed and simulation number
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate sigma
sim.betas <- rmvnorm(n=num.sims, mean=linear.model$coeff, sigma=cluster.v)

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate mu and set vector for expected value estimates
ev.control.lin <- c()
mu.control <- median.control%*%t(sim.betas)

ev.hpft.lin <- c()
mu.hpft <- median.hpft%*%t(sim.betas)

ev.lpft.lin <- c()
mu.lpft <- median.lpft%*%t(sim.betas)

ev.st.lin <- c()
mu.st <- median.st%*%t(sim.betas)

##Calculate standard deviation
sd <- sqrt(sigma2)

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnorm(n=num.sims, mean = mu.control[i], sd = sd)
  y.hpft <- rnorm(n=num.sims, mean = mu.hpft[i], sd = sd)
  y.lpft <- rnorm(n=num.sims, mean = mu.lpft[i], sd = sd)
  y.st <- rnorm(n=num.sims, mean = mu.st[i], sd = sd)
  
  ev.control.lin[i] <- mean(y.control)
  ev.hpft.lin[i] <- mean(y.hpft)
  ev.lpft.lin[i] <- mean(y.lpft)
  ev.st.lin[i] <- mean(y.st)
}

########################################################################
#                Negative binomial for full sample                    #  
########################################################################

##Remove observations with missing data
xsect.negbin <- xsect[complete.cases(xsect),]

##Create outcome variable vector
y <- as.matrix(subset(xsect.negbin, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xsect.negbin)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xsect.negbin, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameter values
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficient values
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#                   Simulate negbin for full sample                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin[i] <- mean(y.control)
  ev.hpft.negbin[i] <- mean(y.hpft)
  ev.lpft.negbin[i] <- mean(y.lpft)
  ev.st.negbin[i] <- mean(y.st)
}

########################################################################
#            Negative binomial with catholic population                #  
########################################################################

##Subset dataset to subpopulation
xs.cath <- subset(xsect,xsect[,c("roman")]==1)
xs.cath <- xs.cath[complete.cases(xs.cath),]

##Create outcome vector
y <- as.matrix(subset(xs.cath, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.cath)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.cath, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print out coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#          Simulate negbin with catholic population                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.cath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.cath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.cath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.cath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.cath[i] <- mean(y.control)
  ev.hpft.negbin.cath[i] <- mean(y.hpft)
  ev.lpft.negbin.cath[i] <- mean(y.lpft)
  ev.st.negbin.cath[i] <- mean(y.st)
}

########################################################################
#        Negative binomial with non-catholic population               #  
########################################################################

##Subset dataset to subpopulation
xs.rest <- subset(xsect,xsect[,c("roman")]==0)
xs.rest <- xs.rest[complete.cases(xs.rest),]

##Create outcome vector
y <- as.matrix(subset(xs.rest, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.rest)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.rest, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#            Simulate negbin with non-catholic populations             #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean=opt.betas.negbin$par, sigma=V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.ncath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.ncath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.ncath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.ncath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.ncath[i] <- mean(y.control)
  ev.hpft.negbin.ncath[i] <- mean(y.hpft)
  ev.lpft.negbin.ncath[i] <- mean(y.lpft)
  ev.st.negbin.ncath[i] <- mean(y.st)
}

########################################################################
#                    Create Overall Chart                              #  
########################################################################
summary <- c()
##Create dataset for chart
summary <- as.data.frame( cbind("Full Population \n (Ashraf et al.)","Voluntary", mean(ev.control.lin), quantile(ev.control.lin, c(0.025)), quantile(ev.control.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","High Financial", mean(ev.hpft.lin), quantile(ev.hpft.lin, c(0.025)), quantile(ev.hpft.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","Low Financial", mean(ev.lpft.lin), quantile(ev.lpft.lin, c(0.025)), quantile(ev.lpft.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","Star Reward", mean(ev.st.lin), quantile(ev.st.lin, c(0.025)), quantile(ev.st.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Voluntary", mean(ev.control.negbin), quantile(ev.control.negbin, c(0.025)), quantile(ev.control.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","High Financial", mean(ev.hpft.negbin), quantile(ev.hpft.negbin, c(0.025)), quantile(ev.hpft.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Low Financial", mean(ev.lpft.negbin), quantile(ev.lpft.negbin, c(0.025)), quantile(ev.lpft.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Star Reward", mean(ev.st.negbin), quantile(ev.st.negbin, c(0.025)), quantile(ev.st.negbin, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.cath), quantile(ev.control.negbin.cath, c(0.025)), quantile(ev.control.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","High Financial", mean(ev.hpft.negbin.cath), quantile(ev.hpft.negbin.cath, c(0.025)), quantile(ev.hpft.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Low Financial", mean(ev.lpft.negbin.cath), quantile(ev.lpft.negbin.cath, c(0.025)), quantile(ev.lpft.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Star Reward", mean(ev.st.negbin.cath), quantile(ev.st.negbin.cath, c(0.025)), quantile(ev.st.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.ncath), quantile(ev.control.negbin.ncath, c(0.025)), quantile(ev.control.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","High Financial", mean(ev.hpft.negbin.ncath), quantile(ev.hpft.negbin.ncath, c(0.025)), quantile(ev.hpft.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Low Financial", mean(ev.lpft.negbin.ncath), quantile(ev.lpft.negbin.ncath, c(0.025)), quantile(ev.lpft.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Star Reward", mean(ev.st.negbin.ncath), quantile(ev.st.negbin.ncath, c(0.025)), quantile(ev.st.negbin.ncath, c(0.975))))

colnames(summary)[1] <- "Model"
colnames(summary)[2] <- "Treatment"
colnames(summary)[3] <- "Mean"
colnames(summary)[4] <- "Lower"
colnames(summary)[5] <- "Upper"

summary$Mean <- as.numeric(as.character(summary$Mean))
summary$Lower <- as.numeric(as.character(summary$Lower))
summary$Upper <- as.numeric(as.character(summary$Upper))

summary$Model <- relevel(summary$Model, "Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Non-Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Full Population \n (negative binomial)")
summary$Model <- relevel(summary$Model,"Full Population \n (Ashraf et al.)")

##Final graph
pd <- position_dodge(0.4)
p <- ggplot(summary, aes(Model, Mean, colour = Treatment))
model <- p + geom_point(data=summary, aes(x=Model, y=Mean), position=pd) + 
  theme(panel.background = element_rect(fill = "white"), 
        panel.grid.major.y = element_line(colour = "gray", linetype = "dotted"),
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(), 
        legend.title= element_blank(), 
        legend.background = element_rect(fill = "white"), 
        legend.position="top",
        legend.key = element_rect(fill = "white"), 
        axis.ticks = element_blank(), 
        axis.title = element_blank(), 
        axis.text.x = element_text(size=12)) +
  scale_y_continuous(breaks=seq(0,25,5)) +
  scale_colour_discrete(name="Treatment Group") +
  geom_errorbar(aes(ymax = Upper, ymin = Lower, width = 0.2), position=pd)
pdf("Fig1.pdf", width=8, height=4)
model 
dev.off()

########################################################################
#                                                                      #
#                                                                      #
#                          Robustness tests                            #
#                                                                      #
#                                                                      #
########################################################################

########################################################################
#                                                                      #
#                              Balance                                 #
#                                                                      #
########################################################################

## designate variables lists for the tasks to come
balance.vars <- c("treatN", "barb", "bar", "numEmpl", "cellPop", "otherProd", "assetsLow", "lowsocio", "donation", "social", "wage", "litone", "english", "numProducts")

model.vars <- c("treat", "packsSold", "barb", "bar", "numEmpl", "LnumEmpl", "cellPop", "otherProd", "assetsLow", "lowsocio", "donationHigh", "social", "roman")

#################################################
###         BALANCE TABLES                    ###
#################################################

##Create datasets with balance variables
xs.cath <- subset(xsect.full, roman==1)
xs.rest <- subset(xsect.full, roman==0)

## Create data frames with only the variables needed for balance tables
cath.bal <- xs.cath[balance.vars]
rest.bal <- xs.rest[balance.vars]

#######################
### Catholic sub-set ##
#######################
cath.means <- balance.vars[-1]
for(i in 1:length(unique(cath.bal$treatN))){
  sub <- subset(cath.bal, treatN==i)
  means <- c()
  for(j in 2:ncol(sub)){
    data <- na.omit(sub[,j])
    means[j-1] <- mean(data)
  }
  cath.means <- cbind(cath.means, means)
}
colnames(cath.means) <- c("variable", "HPFT", "LPFT", "PVT", "ST")
cath.means <- cath.means[,c(1,4,2,3,5)]

## subsetting data for each treatment arm
cath.pvt <- subset(cath.bal, treatN==3)
cath.pvt <- cath.pvt[-1]
cath.hpft <- subset(cath.bal, treatN==1)
cath.hpft <- cath.hpft[-1]
cath.lpft <- subset(cath.bal, treatN==2)
cath.lpft <- cath.lpft[-1]
cath.st <- subset(cath.bal, treatN==4)
cath.st <- cath.st[-1]

## computing pairwise t-tests between each treatment arm and the control (PVT)
pval <- c()
for(i in 1:ncol(cath.pvt)){
  t <- t.test(cath.hpft[,i], cath.pvt[,i])
  pval[i] <- t$p.value
}
cath.means <- cbind(cath.means, pval)
for(i in 1:ncol(cath.pvt)){
  t <- t.test(cath.lpft[,i], cath.pvt[,i])
  pval[i] <- t$p.value
}
cath.means <- cbind(cath.means, pval)
for(i in 1:ncol(cath.pvt)){
  t <- t.test(cath.st[,i], cath.pvt[,i])
  pval[i] <- t$p.value
}
cath.means <- cbind(cath.means, pval)
colnames(cath.means) <- c(colnames(cath.means)[1:5], "hpft.pvals", "lpft.pvals", "st.pvals")


###########################
### Non-Catholic sub-set ##
###########################
rest.means <- balance.vars[-1]
for(i in 1:length(unique(rest.bal$treatN))){
  sub <- subset(rest.bal, treatN==i)
  means <- c()
  for(j in 2:ncol(sub)){
    data <- na.omit(sub[,j])
    means[j-1] <- mean(data)
  }
  rest.means <- cbind(rest.means, means)
}
colnames(rest.means) <- c("variable", "HPFT", "LPFT", "PVT", "ST")
rest.means <- rest.means[,c(1,4,2,3,5)]

## subsetting data for each treatment arm
rest.pvt <- subset(rest.bal, treatN==3)
rest.pvt <- rest.pvt[-1]
rest.hpft <- subset(rest.bal, treatN==1)
rest.hpft <- rest.hpft[-1]
rest.lpft <- subset(rest.bal, treatN==2)
rest.lpft <- rest.lpft[-1]
rest.st <- subset(rest.bal, treatN==4)
rest.st <- rest.st[-1]

## computing pairwise t-tests between each treatment arm and the control (PVT)
pval <- c()
for(i in 1:ncol(rest.pvt)){
  t <- t.test(rest.hpft[,i], rest.pvt[,i])
  pval[i] <- t$p.value
}
rest.means <- cbind(rest.means, pval)
for(i in 1:ncol(rest.pvt)){
  t <- t.test(rest.lpft[,i], rest.pvt[,i])
  pval[i] <- t$p.value
}
rest.means <- cbind(rest.means, pval)
for(i in 1:ncol(rest.pvt)){
  t <- t.test(rest.st[,i], rest.pvt[,i])
  pval[i] <- t$p.value
}
rest.means <- cbind(rest.means, pval)
colnames(rest.means) <- c(colnames(rest.means)[1:5], "hpft.pvals", "lpft.pvals", "st.pvals")

########################################################################
#                                                                      #
#               Run analysis with packs as outcome                     #
#                                                                      #
########################################################################

rm(list=ls())

########################################################################
#                     Load data                                        #  
########################################################################

## VARIABLE DEFINITIONS
# treat = which treatment arm
# packs = restocked number of packs
# packsSold = calculated (includes endownment of 12)
# barb = salon is a barbershop (binary)
# bar = salon is near a bar (binary)
# numEmpl = salon size (number of employees) (count)
# LnumEmpl = salon size (log number of employees) 
# cellpop = number of trained salons in the same area (count)
# otherProd = stylist sells other products in salon (binary)
# assetsLow = stylist is in bottom quartile of asset distribution (binary)
# lowsocio = stylist's socioeconomic status is low (binary) (proxied by either not speaking
# English or not having completed primary education)
# donation = stylist's dictator game donation (in Kwacha)
# donationHigh = stylist's dictator game donation above the median (binary)
# social = stylist's reported work motivation is intrinsic
# wage = monthly income of the salon (in Kwacha)
# litone = stylist can read and write in at least one language (binary)
# english = stylist can read and write in english
# numProducts = total number of products sold

##Read in data from Stata file
xsect.full <- read.dta("XSect.dta")

##Subset to relevant variables and recode
xsect.full$bar <- as.numeric(xsect.full$bar)-1
xsect.full$english <- as.numeric(xsect.full$english)-1
xsect.full$otherProd <- as.numeric(xsect.full$otherProd)-1
xsect <- subset(xsect.full, select = c("packs","treat","barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman", "cellNo"))

##Define median person
barb.med <- median(na.omit(xsect.full$barb))
bar.med <- median(na.omit(xsect.full$bar))
LnumEmpl.med <- median(na.omit(xsect.full$LnumEmpl))
cellPop.med <- median(na.omit(xsect.full$cellPop))
otherProd.med <- median(na.omit(xsect.full$otherProd))
assetsLow.med <- median(na.omit(xsect.full$assetsLow))
lowsocio.med <- median(na.omit(xsect.full$lowsocio))
donationHigh.med <- median(na.omit(xsect.full$donationHigh))
social.med <- median(na.omit(xsect.full$social))
roman.med <- median(na.omit(xsect.full$roman))

########################################################################
#                   Log likelihood function                            #  
########################################################################

##Negative binomial function
negbin.likelihood <- function(param, x, y){
  betas <- param[1:ncol(x)]
  gamgam <- param[ncol(x)+1]
  xb <- x%*%betas
  sum(lgamma(exp(gamgam)+y) - lgamma(exp(gamgam)) + y*xb + exp(gamgam)*gamgam 
      - (exp(gamgam)+y)*log(exp(xb)+exp(gamgam)))
}

########################################################################
#                           Begin analysis                            #  
########################################################################

########################################################################
#                         Linear model                                 #
########################################################################

##Create data set for linear model
xsect.linear <- xsect
xsect.linear <- na.omit(xsect.linear)
xsect.linear$treat <- as.factor(xsect.linear$treat)
xsect.linear$treat <- relevel(xsect.linear$treat, "PVT")

##Run linear model
linear.model <- lm(packs ~ treat + barb + bar + LnumEmpl + cellPop + otherProd +
                     assetsLow + lowsocio + donationHigh + social + roman, data = xsect.linear)

##Calculate clustered variance-covariance matrix
cluster.v <- cluster.vcov(linear.model,xsect.linear$cellNo)
se.cluster = sqrt(diag(cluster.v))
sigma2 <- var(linear.model$model$packs)

########################################################################
#                     Simulate linear model                            #  
########################################################################

##Set seed and simulation number
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate sigma
sim.betas <- rmvnorm(n=num.sims, mean=linear.model$coeff, sigma=cluster.v)

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate mu and set vector for expected value estimates
ev.control.lin <- c()
mu.control <- median.control%*%t(sim.betas)

ev.hpft.lin <- c()
mu.hpft <- median.hpft%*%t(sim.betas)

ev.lpft.lin <- c()
mu.lpft <- median.lpft%*%t(sim.betas)

ev.st.lin <- c()
mu.st <- median.st%*%t(sim.betas)

##Calculate standard deviation
sd <- sqrt(sigma2)

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnorm(n=num.sims, mean = mu.control[i], sd = sd)
  y.hpft <- rnorm(n=num.sims, mean = mu.hpft[i], sd = sd)
  y.lpft <- rnorm(n=num.sims, mean = mu.lpft[i], sd = sd)
  y.st <- rnorm(n=num.sims, mean = mu.st[i], sd = sd)
  
  ev.control.lin[i] <- mean(y.control)
  ev.hpft.lin[i] <- mean(y.hpft)
  ev.lpft.lin[i] <- mean(y.lpft)
  ev.st.lin[i] <- mean(y.st)
}

########################################################################
#                Negative binomial for full sample                    #  
########################################################################

##Remove observations with missing data
xsect.negbin <- xsect[complete.cases(xsect),]

##Create outcome variable vector
y <- as.matrix(subset(xsect.negbin, select = packs))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xsect.negbin)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xsect.negbin, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameter values
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficient values
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#                   Simulate negbin for full sample                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin[i] <- mean(y.control)
  ev.hpft.negbin[i] <- mean(y.hpft)
  ev.lpft.negbin[i] <- mean(y.lpft)
  ev.st.negbin[i] <- mean(y.st)
}

########################################################################
#            Negative binomial with catholic population                #  
########################################################################

##Subset dataset to subpopulation
xs.cath <- subset(xsect,xsect[,c("roman")]==1)
xs.cath <- xs.cath[complete.cases(xs.cath),]

##Create outcome vector
y <- as.matrix(subset(xs.cath, select = packs))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.cath)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.cath, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print out coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#          Simulate negbin with catholic population                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.cath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.cath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.cath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.cath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.cath[i] <- mean(y.control)
  ev.hpft.negbin.cath[i] <- mean(y.hpft)
  ev.lpft.negbin.cath[i] <- mean(y.lpft)
  ev.st.negbin.cath[i] <- mean(y.st)
}

########################################################################
#        Negative binomial with non-catholic population               #  
########################################################################

##Subset dataset to subpopulation
xs.rest <- subset(xsect,xsect[,c("roman")]==0)
xs.rest <- xs.rest[complete.cases(xs.rest),]

##Create outcome vector
y <- as.matrix(subset(xs.rest, select = packs))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.rest)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.rest, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#              Negbin with non-catholic populations                   #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean=opt.betas.negbin$par, sigma=V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.ncath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.ncath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.ncath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.ncath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.ncath[i] <- mean(y.control)
  ev.hpft.negbin.ncath[i] <- mean(y.hpft)
  ev.lpft.negbin.ncath[i] <- mean(y.lpft)
  ev.st.negbin.ncath[i] <- mean(y.st)
}

########################################################################
#                    Create Overall Chart                              #  
########################################################################
summary <- c()
##Create dataset for chart
summary <- as.data.frame(cbind("Full Population \n (Ashraf et al.)","Voluntary", mean(ev.control.lin), quantile(ev.control.lin, c(0.025)), quantile(ev.control.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","High", mean(ev.hpft.lin), quantile(ev.hpft.lin, c(0.025)), quantile(ev.hpft.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","Low", mean(ev.lpft.lin), quantile(ev.lpft.lin, c(0.025)), quantile(ev.lpft.lin, c(0.975))))

summary <- rbind(summary,cbind("Full Population \n (Ashraf et al.)","Star", mean(ev.st.lin), quantile(ev.st.lin, c(0.025)), quantile(ev.st.lin, c(0.975))))

summary <- rbind(summary,cbind("Full Population \n (negative binomial)","Voluntary", mean(ev.control.negbin), quantile(ev.control.negbin, c(0.025)), quantile(ev.control.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","High", mean(ev.hpft.negbin), quantile(ev.hpft.negbin, c(0.025)), quantile(ev.hpft.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Low", mean(ev.lpft.negbin), quantile(ev.lpft.negbin, c(0.025)), quantile(ev.lpft.negbin, c(0.975))))

summary <- rbind(summary,cbind("Full Population \n (negative binomial)","Star", mean(ev.st.negbin), quantile(ev.st.negbin, c(0.025)), quantile(ev.st.negbin, c(0.975))))

summary <- rbind(summary,cbind("Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.cath), quantile(ev.control.negbin.cath, c(0.025)), quantile(ev.control.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","High", mean(ev.hpft.negbin.cath), quantile(ev.hpft.negbin.cath, c(0.025)), quantile(ev.hpft.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Low", mean(ev.lpft.negbin.cath), quantile(ev.lpft.negbin.cath, c(0.025)), quantile(ev.lpft.negbin.cath, c(0.975))))

summary <- rbind(summary,cbind("Catholics \n (negative binomial)","Star", mean(ev.st.negbin.cath), quantile(ev.st.negbin.cath, c(0.025)), quantile(ev.st.negbin.cath, c(0.975))))

summary <- rbind(summary,cbind("Non-Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.ncath), quantile(ev.control.negbin.ncath, c(0.025)), quantile(ev.control.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","High", mean(ev.hpft.negbin.ncath), quantile(ev.hpft.negbin.ncath, c(0.025)), quantile(ev.hpft.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Low", mean(ev.lpft.negbin.ncath), quantile(ev.lpft.negbin.ncath, c(0.025)), quantile(ev.lpft.negbin.ncath, c(0.975))))

summary <- rbind(summary,cbind("Non-Catholics \n (negative binomial)","Star", mean(ev.st.negbin.ncath), quantile(ev.st.negbin.ncath, c(0.025)), quantile(ev.st.negbin.ncath, c(0.975))))

colnames(summary)[1] <- "Model"
colnames(summary)[2] <- "Treatment"
colnames(summary)[3] <- "Mean"
colnames(summary)[4] <- "Lower"
colnames(summary)[5] <- "Upper"

summary$Mean <- as.numeric(as.character(summary$Mean))
summary$Lower <- as.numeric(as.character(summary$Lower))
summary$Upper <- as.numeric(as.character(summary$Upper))

summary$Model <- relevel(summary$Model, "Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Non-Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Full Population \n (negative binomial)")
summary$Model <- relevel(summary$Model,"Full Population \n (Ashraf et al.)")

##Final graph
pd <- position_dodge(0.4)
p <- ggplot(summary, aes(Model, Mean, colour = Treatment), xlim=c(9,22))
model <- p + geom_point(data=summary, aes(x=Model, y=Mean), position=pd) + 
  theme(panel.background = element_rect(fill = "white"), panel.grid.major.y = element_blank(), 
        panel.grid.major.y = element_line(colour = "gray", linetype = "dotted"), 
        panel.grid.minor.x = element_blank(), legend.background = element_rect(fill = "white"), 
        legend.key = element_rect(fill = "white"), 
        axis.ticks = element_blank(), axis.title = element_blank(), legend.title= element_blank(), 
        plot.title=element_text(color="gray10", size=15)) +
  scale_y_continuous(breaks=seq(0,30,5))
model + geom_errorbar(aes(ymax = Upper, ymin = Lower, width = 0.2), position=pd)

########################################################################
#                                                                      #
#      Run analysis with different values for median stylist           #
#                       Use packsSold                                  #
########################################################################

rm(list=ls())

########################################################################
#                     Load data                                        #  
########################################################################

## VARIABLE DEFINITIONS
# treat = which treatment arm
# packs = restocked number of packs
# packsSold = calculated (includes endownment of 12)
# barb = salon is a barbershop (binary)
# bar = salon is near a bar (binary)
# numEmpl = salon size (number of employees) (count)
# LnumEmpl = salon size (log number of employees) 
# cellpop = number of trained salons in the same area (count)
# otherProd = stylist sells other products in salon (binary)
# assetsLow = stylist is in bottom quartile of asset distribution (binary)
# lowsocio = stylist's socioeconomic status is low (binary) (proxied by either not speaking
# English or not having completed primary education)
# donation = stylist's dictator game donation (in Kwacha)
# donationHigh = stylist's dictator game donation above the median (binary)
# social = stylist's reported work motivation is intrinsic
# wage = monthly income of the salon (in Kwacha)
# litone = stylist can read and write in at least one language (binary)
# english = stylist can read and write in english
# numProducts = total number of products sold

##Read in data from Stata file
xsect.full <- read.dta("XSect.dta")

##Subset to relevant variables and recode
xsect.full$bar <- as.numeric(xsect.full$bar)-1
xsect.full$english <- as.numeric(xsect.full$english)-1
xsect.full$otherProd <- as.numeric(xsect.full$otherProd)-1
xsect <- subset(xsect.full, select = c("packsSold","treat","barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman", "cellNo"))

##Define median person
barb.med <- 1
bar.med <- 0
LnumEmpl.med <- median(na.omit(xsect.full$LnumEmpl))
cellPop.med <- median(na.omit(xsect.full$cellPop))
otherProd.med <- 1
assetsLow.med <- 1
lowsocio.med <- 1
donationHigh.med <- 0
social.med <- 0
roman.med <- 1

########################################################################
#                   Log likelihood function                            #  
########################################################################

##Negative binomial function
negbin.likelihood <- function(param, x, y){
  betas <- param[1:ncol(x)]
  gamgam <- param[ncol(x)+1]
  xb <- x%*%betas
  sum(lgamma(exp(gamgam)+y) - lgamma(exp(gamgam)) + y*xb + exp(gamgam)*gamgam 
      - (exp(gamgam)+y)*log(exp(xb)+exp(gamgam)))
}

########################################################################
#                           Begin analysis                            #  
########################################################################

########################################################################
#                         Linear model                                 #
########################################################################

##Create data set for linear model
xsect.linear <- xsect
xsect.linear <- na.omit(xsect.linear)
xsect.linear$treat <- as.factor(xsect.linear$treat)
xsect.linear$treat <- relevel(xsect.linear$treat, "PVT")

##Run linear model
linear.model <- lm(packsSold ~ treat + barb + bar + LnumEmpl + cellPop + otherProd +
                     assetsLow + lowsocio + donationHigh + social + roman, data = xsect.linear)

##Calculate clustered variance-covariance matrix
cluster.v <- cluster.vcov(linear.model,xsect.linear$cellNo)
se.cluster = sqrt(diag(cluster.v))
sigma2 <- var(linear.model$model$packs)

########################################################################
#                     Simulate linear model                            #  
########################################################################

##Set seed and simulation number
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate sigma
sim.betas <- rmvnorm(n=num.sims, mean=linear.model$coeff, sigma=cluster.v)

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate mu and set vector for expected value estimates
ev.control.lin <- c()
mu.control <- median.control%*%t(sim.betas)

ev.hpft.lin <- c()
mu.hpft <- median.hpft%*%t(sim.betas)

ev.lpft.lin <- c()
mu.lpft <- median.lpft%*%t(sim.betas)

ev.st.lin <- c()
mu.st <- median.st%*%t(sim.betas)

##Calculate standard deviation
sd <- sqrt(sigma2)

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnorm(n=num.sims, mean = mu.control[i], sd = sd)
  y.hpft <- rnorm(n=num.sims, mean = mu.hpft[i], sd = sd)
  y.lpft <- rnorm(n=num.sims, mean = mu.lpft[i], sd = sd)
  y.st <- rnorm(n=num.sims, mean = mu.st[i], sd = sd)
  
  ev.control.lin[i] <- mean(y.control)
  ev.hpft.lin[i] <- mean(y.hpft)
  ev.lpft.lin[i] <- mean(y.lpft)
  ev.st.lin[i] <- mean(y.st)
}

########################################################################
#                Negative binomial for full sample                    #  
########################################################################

##Remove observations with missing data
xsect.negbin <- xsect[complete.cases(xsect),]

##Create outcome variable vector
y <- as.matrix(subset(xsect.negbin, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xsect.negbin)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xsect.negbin, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameter values
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficient values
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#                   Simulate negbin for full sample                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin[i] <- mean(y.control)
  ev.hpft.negbin[i] <- mean(y.hpft)
  ev.lpft.negbin[i] <- mean(y.lpft)
  ev.st.negbin[i] <- mean(y.st)
}

########################################################################
#            Negative binomial with catholic population                #  
########################################################################

##Subset dataset to subpopulation
xs.cath <- subset(xsect,xsect[,c("roman")]==1)
xs.cath <- xs.cath[complete.cases(xs.cath),]

##Create outcome vector
y <- as.matrix(subset(xs.cath, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.cath)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.cath, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print out coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#          Simulate negbin with catholic population                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.cath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.cath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.cath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.cath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.cath[i] <- mean(y.control)
  ev.hpft.negbin.cath[i] <- mean(y.hpft)
  ev.lpft.negbin.cath[i] <- mean(y.lpft)
  ev.st.negbin.cath[i] <- mean(y.st)
}

########################################################################
#        Negative binomial with non-catholic population               #  
########################################################################

##Subset dataset to subpopulation
xs.rest <- subset(xsect,xsect[,c("roman")]==0)
xs.rest <- xs.rest[complete.cases(xs.rest),]

##Create outcome vector
y <- as.matrix(subset(xs.rest, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.rest)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.rest, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#              Negbin with non-catholic populations                   #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean=opt.betas.negbin$par, sigma=V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.ncath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.ncath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.ncath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.ncath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.ncath[i] <- mean(y.control)
  ev.hpft.negbin.ncath[i] <- mean(y.hpft)
  ev.lpft.negbin.ncath[i] <- mean(y.lpft)
  ev.st.negbin.ncath[i] <- mean(y.st)
}

########################################################################
#                    Create Overall Chart                              #  
########################################################################
summary <- c()
##Create dataset for chart
summary <- as.data.frame(cbind("Full Population \n (Ashraf et al.)","Voluntary", mean(ev.control.lin), quantile(ev.control.lin, c(0.025)), quantile(ev.control.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","High", mean(ev.hpft.lin), quantile(ev.hpft.lin, c(0.025)), quantile(ev.hpft.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","Low", mean(ev.lpft.lin), quantile(ev.lpft.lin, c(0.025)), quantile(ev.lpft.lin, c(0.975))))

summary <- rbind(summary,cbind("Full Population \n (Ashraf et al.)","Star", mean(ev.st.lin), quantile(ev.st.lin, c(0.025)), quantile(ev.st.lin, c(0.975))))

summary <- rbind(summary,cbind("Full Population \n (negative binomial)","Voluntary", mean(ev.control.negbin), quantile(ev.control.negbin, c(0.025)), quantile(ev.control.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","High", mean(ev.hpft.negbin), quantile(ev.hpft.negbin, c(0.025)), quantile(ev.hpft.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Low", mean(ev.lpft.negbin), quantile(ev.lpft.negbin, c(0.025)), quantile(ev.lpft.negbin, c(0.975))))

summary <- rbind(summary,cbind("Full Population \n (negative binomial)","Star", mean(ev.st.negbin), quantile(ev.st.negbin, c(0.025)), quantile(ev.st.negbin, c(0.975))))

summary <- rbind(summary,cbind("Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.cath), quantile(ev.control.negbin.cath, c(0.025)), quantile(ev.control.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","High", mean(ev.hpft.negbin.cath), quantile(ev.hpft.negbin.cath, c(0.025)), quantile(ev.hpft.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Low", mean(ev.lpft.negbin.cath), quantile(ev.lpft.negbin.cath, c(0.025)), quantile(ev.lpft.negbin.cath, c(0.975))))

summary <- rbind(summary,cbind("Catholics \n (negative binomial)","Star", mean(ev.st.negbin.cath), quantile(ev.st.negbin.cath, c(0.025)), quantile(ev.st.negbin.cath, c(0.975))))

summary <- rbind(summary,cbind("Non-Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.ncath), quantile(ev.control.negbin.ncath, c(0.025)), quantile(ev.control.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","High", mean(ev.hpft.negbin.ncath), quantile(ev.hpft.negbin.ncath, c(0.025)), quantile(ev.hpft.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Low", mean(ev.lpft.negbin.ncath), quantile(ev.lpft.negbin.ncath, c(0.025)), quantile(ev.lpft.negbin.ncath, c(0.975))))

summary <- rbind(summary,cbind("Non-Catholics \n (negative binomial)","Star", mean(ev.st.negbin.ncath), quantile(ev.st.negbin.ncath, c(0.025)), quantile(ev.st.negbin.ncath, c(0.975))))

colnames(summary)[1] <- "Model"
colnames(summary)[2] <- "Treatment"
colnames(summary)[3] <- "Mean"
colnames(summary)[4] <- "Lower"
colnames(summary)[5] <- "Upper"

summary$Mean <- as.numeric(as.character(summary$Mean))
summary$Lower <- as.numeric(as.character(summary$Lower))
summary$Upper <- as.numeric(as.character(summary$Upper))

summary$Model <- relevel(summary$Model, "Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Non-Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Full Population \n (negative binomial)")
summary$Model <- relevel(summary$Model,"Full Population \n (Ashraf et al.)")

##Final graph
pd <- position_dodge(0.4)
p <- ggplot(summary, aes(Model, Mean, colour = Treatment), xlim=c(9,22))
model <- p + geom_point(data=summary, aes(x=Model, y=Mean), position=pd) + 
  theme(panel.background = element_rect(fill = "white"), panel.grid.major.y = element_blank(), 
        panel.grid.major.y = element_line(colour = "gray", linetype = "dotted"), 
        panel.grid.minor.x = element_blank(), legend.background = element_rect(fill = "white"), 
        legend.key = element_rect(fill = "white"), 
        axis.ticks = element_blank(), axis.title = element_blank(), legend.title= element_blank(), 
        plot.title=element_text(color="gray10", size=15)) +
  scale_y_continuous(breaks=seq(0,30,5))
model + geom_errorbar(aes(ymax = Upper, ymin = Lower, width = 0.2), position=pd)

########################################################################
#                                                                      #
#      Run analysis with extreme   values for median stylist           #
#           (low socio and high cellPop) Use packsSold                 #
########################################################################

rm(list=ls())

########################################################################
#                     Load data                                        #  
########################################################################

## VARIABLE DEFINITIONS
# treat = which treatment arm
# packs = restocked number of packs
# packsSold = calculated (includes endownment of 12)
# barb = salon is a barbershop (binary)
# bar = salon is near a bar (binary)
# numEmpl = salon size (number of employees) (count)
# LnumEmpl = salon size (log number of employees) 
# cellpop = number of trained salons in the same area (count)
# otherProd = stylist sells other products in salon (binary)
# assetsLow = stylist is in bottom quartile of asset distribution (binary)
# lowsocio = stylist's socioeconomic status is low (binary) (proxied by either not speaking
# English or not having completed primary education)
# donation = stylist's dictator game donation (in Kwacha)
# donationHigh = stylist's dictator game donation above the median (binary)
# social = stylist's reported work motivation is intrinsic
# wage = monthly income of the salon (in Kwacha)
# litone = stylist can read and write in at least one language (binary)
# english = stylist can read and write in english
# numProducts = total number of products sold

##Read in data from Stata file
xsect.full <- read.dta("XSect.dta")

##Subset to relevant variables and recode
xsect.full$bar <- as.numeric(xsect.full$bar)-1
xsect.full$english <- as.numeric(xsect.full$english)-1
xsect.full$otherProd <- as.numeric(xsect.full$otherProd)-1
xsect <- subset(xsect.full, select = c("packsSold","treat","barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman", "cellNo"))

##Define median person
barb.med <- median(na.omit(xsect.full$barb))
bar.med <- median(na.omit(xsect.full$bar))
LnumEmpl.med <- median(na.omit(xsect.full$LnumEmpl))
cellPop.med <- 30
otherProd.med <- median(na.omit(xsect.full$otherProd))
assetsLow.med <- median(na.omit(xsect.full$assetsLow))
lowsocio.med <- 1
donationHigh.med <- median(na.omit(xsect.full$donationHigh))
social.med <- median(na.omit(xsect.full$social))
roman.med <- median(na.omit(xsect.full$roman))

########################################################################
#                   Log likelihood function                            #  
########################################################################

##Negative binomial function
negbin.likelihood <- function(param, x, y){
  betas <- param[1:ncol(x)]
  gamgam <- param[ncol(x)+1]
  xb <- x%*%betas
  sum(lgamma(exp(gamgam)+y) - lgamma(exp(gamgam)) + y*xb + exp(gamgam)*gamgam 
      - (exp(gamgam)+y)*log(exp(xb)+exp(gamgam)))
}

########################################################################
#                           Begin analysis                            #  
########################################################################

########################################################################
#                         Linear model                                 #
########################################################################

##Create data set for linear model
xsect.linear <- xsect
xsect.linear <- na.omit(xsect.linear)
xsect.linear$treat <- as.factor(xsect.linear$treat)
xsect.linear$treat <- relevel(xsect.linear$treat, "PVT")

##Run linear model
linear.model <- lm(packsSold ~ treat + barb + bar + LnumEmpl + cellPop + otherProd +
                     assetsLow + lowsocio + donationHigh + social + roman, data = xsect.linear)

##Calculate clustered variance-covariance matrix
cluster.v <- cluster.vcov(linear.model,xsect.linear$cellNo)
se.cluster = sqrt(diag(cluster.v))
sigma2 <- var(linear.model$model$packs)

########################################################################
#                     Simulate linear model                            #  
########################################################################

##Set seed and simulation number
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate sigma
sim.betas <- rmvnorm(n=num.sims, mean=linear.model$coeff, sigma=cluster.v)

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate mu and set vector for expected value estimates
ev.control.lin <- c()
mu.control <- median.control%*%t(sim.betas)

ev.hpft.lin <- c()
mu.hpft <- median.hpft%*%t(sim.betas)

ev.lpft.lin <- c()
mu.lpft <- median.lpft%*%t(sim.betas)

ev.st.lin <- c()
mu.st <- median.st%*%t(sim.betas)

##Calculate standard deviation
sd <- sqrt(sigma2)

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnorm(n=num.sims, mean = mu.control[i], sd = sd)
  y.hpft <- rnorm(n=num.sims, mean = mu.hpft[i], sd = sd)
  y.lpft <- rnorm(n=num.sims, mean = mu.lpft[i], sd = sd)
  y.st <- rnorm(n=num.sims, mean = mu.st[i], sd = sd)
  
  ev.control.lin[i] <- mean(y.control)
  ev.hpft.lin[i] <- mean(y.hpft)
  ev.lpft.lin[i] <- mean(y.lpft)
  ev.st.lin[i] <- mean(y.st)
}

########################################################################
#                Negative binomial for full sample                    #  
########################################################################

##Remove observations with missing data
xsect.negbin <- xsect[complete.cases(xsect),]

##Create outcome variable vector
y <- as.matrix(subset(xsect.negbin, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xsect.negbin)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xsect.negbin[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xsect.negbin, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social", "roman"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameter values
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficient values
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#                   Simulate negbin for full sample                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med, assetsLow.med,lowsocio.med, donationHigh.med, social.med, roman.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin[i] <- mean(y.control)
  ev.hpft.negbin[i] <- mean(y.hpft)
  ev.lpft.negbin[i] <- mean(y.lpft)
  ev.st.negbin[i] <- mean(y.st)
}

########################################################################
#            Negative binomial with catholic population                #  
########################################################################

##Subset dataset to subpopulation
xs.cath <- subset(xsect,xsect[,c("roman")]==1)
xs.cath <- xs.cath[complete.cases(xs.cath),]

##Create outcome vector
y <- as.matrix(subset(xs.cath, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.cath)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.cath[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.cath, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print out coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#          Simulate negbin with catholic population                    #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean =  opt.betas.negbin$par, sigma = V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.cath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.cath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.cath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.cath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.cath[i] <- mean(y.control)
  ev.hpft.negbin.cath[i] <- mean(y.hpft)
  ev.lpft.negbin.cath[i] <- mean(y.lpft)
  ev.st.negbin.cath[i] <- mean(y.st)
}

########################################################################
#        Negative binomial with non-catholic population               #  
########################################################################

##Subset dataset to subpopulation
xs.rest <- subset(xsect,xsect[,c("roman")]==0)
xs.rest <- xs.rest[complete.cases(xs.rest),]

##Create outcome vector
y <- as.matrix(subset(xs.rest, select = packsSold))

##Create covariate dataset
x <- as.matrix(rep(1,nrow(xs.rest)))
colnames(x)[1] <- "intercept"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="HPFT",1,0)))
colnames(x)[2] <- "HPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="LPFT",1,0)))
colnames(x)[3] <- "LPFT"
x <- cbind(x,as.matrix(ifelse(xs.rest[,c("treat")]=="ST",1,0)))
colnames(x)[4] <- "ST"

covs <- subset(xs.rest, select = c("barb","bar", "LnumEmpl", "cellPop", "otherProd","assetsLow", "lowsocio", "donationHigh", "social"))

x_cov <- as.matrix(cbind(x,covs))

##Set parameters
param <- rep(0,ncol(x_cov)+1)

##Simulate betas
opt.betas.negbin <- optim(param, fn=negbin.likelihood,y=y,x=x_cov,control=list(fnscale=-1),method="BFGS",hessian=TRUE)

##Print coefficients
opt.betas.negbin$par

##Calculate standard errors
I = -1*opt.betas.negbin$hessian
V = solve(I) 
se = sqrt(diag(V))

########################################################################
#              Negbin with non-catholic populations                   #  
########################################################################

##Set seed and number of simulations
set.seed(12345)
num.sims <- 10000

##Draw betas and calculate theta
sim.betas.negbin <- rmvnorm(n=num.sims, mean=opt.betas.negbin$par, sigma=V)
theta <- exp(sim.betas.negbin[,ncol(x_cov)+1])

##Set treatment variables and other variables to average value
median.values <- c(barb.med, bar.med, LnumEmpl.med, cellPop.med, otherProd.med,
                   assetsLow.med, lowsocio.med, donationHigh.med, social.med)
median.control <- append(c(1,0,0,0),median.values)
median.hpft <- append(c(1,1,0,0),median.values)
median.lpft <- append(c(1,0,1,0),median.values)
median.st <- append(c(1,0,0,1),median.values)

##Calculate lambda and set vector for expected value estimates
ev.control.negbin.ncath <- c()
negbin.control <- exp(median.control%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.hpft.negbin.ncath <- c()
negbin.hpft <- exp(median.hpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.lpft.negbin.ncath <- c()
negbin.lpft <- exp(median.lpft%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

ev.st.negbin.ncath <- c()
negbin.st <- exp(median.st%*%t(sim.betas.negbin[,1:ncol(x_cov)]))

##Run simulations and calculated expected values
for(i in 1:num.sims){
  y.control <- rnbinom(n=num.sims, mu = negbin.control[i], size = theta[i])
  y.hpft <- rnbinom(n=num.sims, mu = negbin.hpft[i], size = theta[i])
  y.lpft <- rnbinom(n=num.sims, mu = negbin.lpft[i], size = theta[i])
  y.st <- rnbinom(n=num.sims, mu = negbin.st[i], size = theta[i])
  
  ev.control.negbin.ncath[i] <- mean(y.control)
  ev.hpft.negbin.ncath[i] <- mean(y.hpft)
  ev.lpft.negbin.ncath[i] <- mean(y.lpft)
  ev.st.negbin.ncath[i] <- mean(y.st)
}

########################################################################
#                    Create Overall Chart                              #  
########################################################################
summary <- c()
##Create dataset for chart
summary <- as.data.frame( cbind("Full Population \n (Ashraf et al.)","Voluntary", mean(ev.control.lin), quantile(ev.control.lin, c(0.025)), quantile(ev.control.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","High Financial", mean(ev.hpft.lin), quantile(ev.hpft.lin, c(0.025)), quantile(ev.hpft.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","Low Financial", mean(ev.lpft.lin), quantile(ev.lpft.lin, c(0.025)), quantile(ev.lpft.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (Ashraf et al.)","Star Reward", mean(ev.st.lin), quantile(ev.st.lin, c(0.025)), quantile(ev.st.lin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Voluntary", mean(ev.control.negbin), quantile(ev.control.negbin, c(0.025)), quantile(ev.control.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","High Financial", mean(ev.hpft.negbin), quantile(ev.hpft.negbin, c(0.025)), quantile(ev.hpft.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Low Financial", mean(ev.lpft.negbin), quantile(ev.lpft.negbin, c(0.025)), quantile(ev.lpft.negbin, c(0.975))))

summary <- rbind(summary, cbind("Full Population \n (negative binomial)","Star Reward", mean(ev.st.negbin), quantile(ev.st.negbin, c(0.025)), quantile(ev.st.negbin, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.cath), quantile(ev.control.negbin.cath, c(0.025)), quantile(ev.control.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","High Financial", mean(ev.hpft.negbin.cath), quantile(ev.hpft.negbin.cath, c(0.025)), quantile(ev.hpft.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Low Financial", mean(ev.lpft.negbin.cath), quantile(ev.lpft.negbin.cath, c(0.025)), quantile(ev.lpft.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Catholics \n (negative binomial)","Star Reward", mean(ev.st.negbin.cath), quantile(ev.st.negbin.cath, c(0.025)), quantile(ev.st.negbin.cath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Voluntary", mean(ev.control.negbin.ncath), quantile(ev.control.negbin.ncath, c(0.025)), quantile(ev.control.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","High Financial", mean(ev.hpft.negbin.ncath), quantile(ev.hpft.negbin.ncath, c(0.025)), quantile(ev.hpft.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Low Financial", mean(ev.lpft.negbin.ncath), quantile(ev.lpft.negbin.ncath, c(0.025)), quantile(ev.lpft.negbin.ncath, c(0.975))))

summary <- rbind(summary, cbind("Non-Catholics \n (negative binomial)","Star Reward", mean(ev.st.negbin.ncath), quantile(ev.st.negbin.ncath, c(0.025)), quantile(ev.st.negbin.ncath, c(0.975))))

colnames(summary)[1] <- "Model"
colnames(summary)[2] <- "Treatment"
colnames(summary)[3] <- "Mean"
colnames(summary)[4] <- "Lower"
colnames(summary)[5] <- "Upper"

summary$Mean <- as.numeric(as.character(summary$Mean))
summary$Lower <- as.numeric(as.character(summary$Lower))
summary$Upper <- as.numeric(as.character(summary$Upper))

summary$Model <- relevel(summary$Model, "Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Non-Catholics \n (negative binomial)")
summary$Model <- relevel(summary$Model, "Full Population \n (negative binomial)")
summary$Model <- relevel(summary$Model,"Full Population \n (Ashraf et al.)")

##Final graph
pd <- position_dodge(0.4)
p <- ggplot(summary, aes(Model, Mean, colour = Treatment))
model <- p + geom_point(data=summary, aes(x=Model, y=Mean), position=pd) + 
  theme(panel.background = element_rect(fill = "white"), 
        panel.grid.major.y = element_line(colour = "gray", linetype = "dotted"),
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(), 
        legend.title= element_blank(), 
        legend.background = element_rect(fill = "white"), 
        legend.position="top",
        legend.key = element_rect(fill = "white"), 
        axis.ticks = element_blank(), 
        axis.title = element_blank(), 
        axis.text.x = element_text(size=12)) +
  scale_y_continuous(breaks=seq(0,25,5)) +
  scale_colour_discrete(name="Treatment Group") +
  geom_errorbar(aes(ymax = Upper, ymin = Lower, width = 0.2), position=pd)
pdf("Fig1.pdf", width=8, height=4)
model 
dev.off()